suppressPackageStartupMessages({
library(SummarizedExperiment)
library(SEtools)
library(ggplot2)
library(plgINS)
library(edgeR)
library(DT)
library(viridis)
library(RColorBrewer)
library(data.table)
library(dplyr)
library(tibble)
library(cowplot)
library(matrixStats)
library(RUVSeq)
})## Warning: replacing previous import 'GenomicRanges::shift' by 'data.table::shift'
## when loading 'plgINS'
# input file: Bartel DEA SE
#input <- "data/bartel.hela.DEA.SE.rds"
input <- "data/bartel.hek.DEA.SE.rds"
# output file: enrichMiR results
#output.e <- "data/bartel.hela.e.TSperm.rds"
output.e <- "data/bartel.hek.e.TSperm.rds"
# number of cores for high runtime functions
cores <- 5
# set number of replicates per permutation proportion
nrep <- 3# load enrichMiR package
devtools::load_all("/mnt/schratt/tgermade_test/master_19_20/enrichMiR/enrichMiR/")#' getDEA
#'
#' @param dea.df dataframe of DEA results (an SE rowData column if generated via DEA() function)
#'
#' @return FDR-filtered dataframe containing c("symbol","logFC","PValue","FDR") columns
#'
getDEA <- function(dea.df){
if(!any(colnames(dea.df) %in% "symbol")) dea.df$symbol <- rownames(dea.df)
dea <- as.data.frame(dea.df[,c("symbol","logFC", "logCPM","PValue","FDR")])
dea <- aggregate(dea[,-1], by=list(symbol=dea$symbol),FUN=mean)
rownames(dea) <- dea$symbol
dea <- dea[,-1]
lapply(dea, FUN = function(x){
if(any(is.infinite(x))){
w <- which(is.infinite(x))
x[w] <- max(abs(x[setdiff(which(dea$FDR<0.5),w)]))*sign(x[w])
}
})
return(dea)
}# generate dea object (high runtime!)
#dea.list <- lapply(dea.df.names, FUN = function(x) getDEA(rowData(se.hela)[[x]]))
dea.list <- bplapply(dea.list, getDEA,
BPPARAM = MulticoreParam(cores, progressbar = FALSE) )
names(dea.list) <- dea.names# we don't use miRNA expression values for the benchmark. This will make it more difficult # for the enrichMiR tests
mirexpr <- NULL#' getTS
#'
#' @param species character object. Can be "human", "mouse" or "rat"
#'
#' @return TargetScan miRNA target dataframe with family information in metadata()
#'
getTS <- function(species=c("human","mouse","rat")){
library(S4Vectors)
species <- match.arg(species)
# assign species ID
spec <- switch( species,
human = 9606,
mouse = 10090,
rat = 10116 )
# download TargetScan miRNA targeting dataset
tmp <- tempfile()
download.file(
"http://www.targetscan.org/vert_72/vert_72_data_download/Summary_Counts.all_predictions.txt.zip",
tmp)
unzip(tmp)
full <- fread("Summary_Counts.all_predictions.txt") #, sep = "\t", header = TRUE)
# limit to selected species
sub <- full[full$'Species ID' == spec,]
sub$score <- as.numeric(as.character(sub$'Cumulative weighted context++ score'))
# generate TargetScan dataframe
ts <- DataFrame(family = sub$'miRNA family',
rep.miRNA = sub$'Representative miRNA',
feature = sub$'Gene Symbol',
sites = sub$'Total num conserved sites',
score = as.numeric(as.character(sub$'Cumulative weighted context++ score'))
)
ts <- DataFrame(
aggregate(sub[,c("sites","score")], by=ts[,c("family","feature")], FUN=mean)
)
# download TargetScan miRNA families dataset
tmp <- tempfile()
download.file(
"http://www.targetscan.org/vert_72/vert_72_data_download/miR_Family_Info.txt.zip",
tmp)
unzip(tmp)
full <- fread("miR_Family_Info.txt") #, sep = "\t", header = TRUE)
# limit to selected species
sub <- full[full$'Species ID' == spec,]
fam <- sub$`Seed+m8`
names(fam) <- sub$`MiRBase ID`
# add family info to ts dataframe as attribute
metadata(ts)$families <- fam
# enrichMiR cant handle 0 values for sites feature
return(ts[ts$sites!=0,])
}# load detailed Bartel treatment info containing exact sequences
pert <- read.csv("data/bartel_treatments.txt", sep = "\t")
# we extract HeLa & HEK data (we disregard passenger strands as long as we're working with TargetScan)
colnames(pert) <- c("treatment","seq")
pert <- list(pert[1:17,], pert[37:48,])
names(pert) <- c("hela","hek")
if(grepl("hela", input)){
pert <- pert$hela
} else if(grepl("hek", input)){
pert <- pert$hek
}
# find out where the seed sequence begins
#unlist(gregexpr(pattern ="GAGGUAG",pert$seq))
#unlist(gregexpr(pattern ="GGAAUGU",pert$seq))
# extract seed sequences
pert$seed <- sapply(pert$seq, function(x) substring(x ,2, 8))
# get seed family for each treatment miRNA (true positive)
TP <- sapply(dea.names, FUN=function(x) unique(as.character(
pert$seed[grepl(paste0(x,"\\("), pert$treatment) | grepl(paste0(x,"$"), pert$treatment)]
)) )#' TSperm
#'
#' @param TS TargetScan DataFrame of miRNA tx targets
#' @param genes string vector of gene names
#' @param props vector of proportions, e.g. c(.2,.3,.4,.5)
#'
#' @return TS DataFrame with permuted miRNA targets
#'
TSperm <- function(TS, TP, genes, props){
# get TargetScan data for each treatment miRNA family
TS.part <- TS[TS$family==TP,]
lapply(props, FUN=function(p){
sn <- floor(p*nrow(TS.part))
genes <- genes[!genes %in% TS.part$feature]
i <- sample(seq_len(nrow(TS.part)), sn)
j <- sample(seq_len(length(genes)), sn)
TS.part$feature[i] <- as.character(genes[j])
TS <- rbind(TS.part, TS[TS$family!=TP,])
return(TS)
})
}# the permuatation proportions that should be computed for each DEA
names(i) <- i <- 1:nrep
props <- unlist(lapply( c("20"=0.2, "30"=0.3, "40"=0.4, "50"=0.5),
FUN=function(x) lapply(i, FUN=function(y) x)))
# do TS permutations
set.seed(1234)
TS.list <- bplapply(dea.names, FUN=function(i){
TSperm(TS, TP[i], rownames(se), props)
}, BPPARAM = MulticoreParam(cores, progressbar = FALSE) )
# naming
names(TS.list) <- dea.names
for(i in dea.names){
names(TS.list[[i]]) <- names(props)
}
# add original TS to permutated ones
for(i in dea.names){
TS.list[[i]][["original"]] <- TS
}
# place original at first place
for(i in dea.names){
TS.list[[i]] <- TS.list[[i]][c("original", names(props))]
}
props.all <- c(original="original", props)tests <- c("areamir","overlap","michael","KS","KS2","MW")
# all tests except WO:
tests <- c("overlap","michael","mw","ks","ks2","areamir","areamir2")
# all tests
tests <- c("overlap","michael","wo","mw","ks","ks2","gsea","gseamod","modscore","modsites","areamir","areamir2")
tests <- NULL
cores <- 8
# run enrichMiR on all objects of dea.list (high runtime!)
## foreach: doesnt work
library(foreach)
library(doParallel)
# cl <- makeCluster(cores)
# registerDoParallel(cl, cores=cores)
# chunk.size <- length(dea.names)/cores
# test <- foreach(c=1:cores, .combine="rbind") %dopar% {
# for(i in dea.names[((c-1)*chunk.size+1):(c*chunk.size)]){
# lapply(names(props.all), FUN=function(j){
# enrichMiR(DEA=as.data.frame(dea.list[[i]]), TS=TS.list[[i]][[j]], miRNA.expression=mirexpr,
# cleanNames=TRUE, tests=tests)
# })
# }
# }
#stopImplicitCluster()
# stopCluster(cl)
## foreach: Pierre's code (works, but 1 core)
res <- foreach( dea=dea.list,
TS=unlist(TS.list,recursive=FALSE) ) %dopar% {
enrichMiR( as.data.frame(dea), TS=TS, miRNA.expression=mirexpr,
cleanNames=TRUE, tests=tests )
}
## mclapply (works, but 1 core; all cores when reduced tests var)
## looks like GSEA, WO, mods are problematic
library(parallel)
# multicores on Linux
e.list <- mclapply(dea.names, FUN=function(i){
lapply(names(props.all), FUN=function(j){
enrichMiR(DEA=as.data.frame(dea.list[[i]]), TS=TS.list[[i]][[j]], miRNA.expression=mirexpr,
cleanNames=TRUE, tests=tests)
})
}, mc.cores = cores )
## mcmapply: inspired by Pierre's code
res <- mcmapply(dea=dea.list,
TS=unlist(TS.list,recursive=FALSE),
FUN=function(x){
enrichMiR( as.data.frame(dea), TS=TS, miRNA.expression=mirexpr,
cleanNames=TRUE, tests=tests )
}, mc.cores = cores )
## bplapply: doesnt work at all
e.list <- bplapply(dea.names[1:4], FUN=function(i){
lapply(names(props.all), FUN=function(j){
enrichMiR(DEA=as.data.frame(dea.list[[i]]), TS=TS.list[[i]][[j]], miRNA.expression=mirexpr,
cleanNames=TRUE, tests=tests)
})
}, BPPARAM = MulticoreParam(cores, progressbar = TRUE) )
## bpmapply: Pierre's code
res <- bpmapply(dea=dea.list,
TS=unlist(TS.list,recursive=FALSE),
BPPARAM=MulticoreParam(4, threshold="TRACE"), FUN=function(x){
enrichMiR( as.data.frame(dea), TS=TS, miRNA.expression=mirexpr, cleanNames=TRUE, tests=tests )
} )
## serial run
e.list <- lapply(dea.names, FUN=function(i){
lapply(names(props.all), FUN=function(j){
enrichMiR(DEA=as.data.frame(dea.list[[i]]), TS=TS.list[[i]][[j]], miRNA.expression=mirexpr,
cleanNames=TRUE, tests=tests)
})
})
# naming
names(e.list) <- dea.names
for(i in dea.names){
names(e.list[[i]]) <- names(props.all)
}
# use this between parallel runs
gc(verbose = T)dat <- function(e, thresholds, TP){
dplyr::bind_rows(
lapply(e@res, FUN=function(x){
if("family" %in% colnames(x)) row.names(x) <- x$family
x$truth <- row.names(x) %in% TP
x$FDR[is.na(x$FDR)] <- 1
as.data.frame(
t(sapply(thresholds, FUN=function(i){
P <- sum(x$FDR<=i)
c( threshold=i,
P=P,
TP=sum(x$FDR<=i & x$truth),
FP=sum(x$FDR<=i & !x$truth),
TPR=sum(x$FDR<=i & x$truth)/sum(x$truth),
FPR=sum(x$FDR<=i & !x$truth)/sum(!x$truth),
FDR=ifelse(P>0,sum(x$FDR<=i & !x$truth)/P,0) )
}))
)
}), .id="method")
}# Benchmarking
devtools::load_all("../enrichMiR/enrichMiR/")
fams <- metadata(TS)$families
#dea <- deas[[1]]
#dea <- rowData(se.hela)$`DEA.let-7a`
#e <- enrichMiR(dea, TS)
#e <- e.list$`miR-122`
#TP <- unique(as.character(fams[grep("miR-144$|miR-144-",names(fams))]))
thresholds <- c(0,10^(-10:-3),0.005,0.01,0.025,0.05,0.075,0.1,0.15,0.2,0.25,0.3,0.5)
# get benchmark results
dat.list <- lapply(dea.names[1], FUN=function(x) lapply(names(e.list[[x]]), FUN=function(y) dat(e.list[[x]][[y]], thresholds, TP.list[[x]]) ))
ggplot(dat.list[[1]][[2]], aes(FPR, TPR, colour=method)) + geom_line() + geom_point(size=3) +
scale_x_log10()
#+ scale_colour_manual(values=cols)#' doBenchmark
#'
#' @param res enrichMiR test results (e object)
#' @param TP character vector: contains the seeds (families) for a miRNA treatment
#'
#' @return a dataframe containing scores for each enrichMiR test
#'
doBenchmark <- function(res, TP){
res <- lapply(res, FUN=function(x){
x <- x[order(x$FDR),]
if("family" %in% colnames(x)) row.names(x) <- x$family
x$truth <- row.names(x) %in% TP
x$FDR[is.na(x$FDR)] <- 1
x
})
data.frame( method=names(res),
detPPV = sapply(res, FUN=function(x) 1/which(x$truth)[1] ),
FP.atFDR05 = sapply(res, FUN=function(x) sum(!x$truth & x$FDR<0.05)),
log10QDiff = sapply(res, FUN=function(x){
tp1 <- -log10(x$FDR[which(x$truth)[1]])
fp1 <- -log10(x$FDR[which(!x$truth)[1]])
tp1-fp1
}),
log10QrelIncrease = sapply(res, FUN=function(x){
tp1 <- -log10(x$FDR[which(x$truth)[1]])
fp1 <- -log10(x$FDR[which(!x$truth)[1]])
(tp1-fp1)/(min(tp1,fp1))
}),
TP.atFDR05 = sapply(res, FUN=function(x) sum(x$truth[x$FDR<0.05]))
)
}# generate the benchmarking scores
BM.list <- lapply(dea.names, FUN=function(x){
lapply(names(e.list[[x]]), FUN=function(y){
doBenchmark(e.list[[x]][[y]]@res, TP[x])
})
})
# naming
names(BM.list) <- dea.names
for(x in dea.names){
names(BM.list[[x]]) <- names(e.list[[x]])
}
# generate a results df for plotting
BM.list2 <- lapply(BM.list, FUN=function(x) dplyr::bind_rows(x, .id = "prop.rep"))
BM.df <- dplyr::bind_rows(BM.list2, .id="treatment")
BM.df$prop <- unlist(lapply(strsplit(BM.df$prop.rep, "[.]"), FUN=function(x) x[1]))
BM.df$prop <- factor(BM.df$prop, levels=unique(BM.df$prop))
df <- BM.df# plots
for(i in c("detPPV","FP.atFDR05","log10QDiff","TP.atFDR05")){
cat("Score analysis: ", i,"\n\n")
## boxplot
print(
ggplot(df, aes(x=prop, y=df[[i]], fill=method)) + geom_boxplot() + ylab(i)
)
if(i!="TP.atFDR05"){
## density plot: score distribution per permutation
print(
ggplot(df, aes(log10(df[[i]]))) + geom_density(aes(fill=method), alpha=0.9) + facet_wrap(~prop) + xlab(i)
)
## violin plot
print(
ggplot(df, aes(x=prop, y=df[[i]], fill=method)) + geom_violin() + facet_wrap(~method) +
guides(fill=FALSE) + ylab(i) + geom_jitter(shape=16, position=position_jitter(0.2))
)
}
## violin plot median
print(
ggplot(df, aes(x=prop, y=df[[i]], fill=method)) + geom_violin(draw_quantiles = 0.5) + facet_wrap(~method) +
guides(fill=FALSE) + ylab(i)
)
}## Score analysis: detPPV
## Score analysis: FP.atFDR05
## Score analysis: log10QDiff
## Score analysis: TP.atFDR05
# mean scores over all datasets; this shows which datasets were difficult to handle for the tests
ggplot(df, aes(x=prop, y=detPPV, fill=treatment)) + geom_violin(draw_quantiles = 0.5) + facet_wrap(~treatment) +
guides(fill=FALSE)mean.datasets <- lapply( unique(df$treatment), FUN=function(x)
sapply( unique(df$prop), function(y)
round( mean(df$detPPV[df$treatment==x & df$prop==y], na.rm=TRUE), 3)
)
)
names(mean.datasets) <- dea.names
for(i in dea.names){
names(mean.datasets[[i]]) <- unique(df$prop)
}
mean.datasets## $`miR-122`
## original 20 30 40 50
## 0.824 0.824 0.825 0.790 0.678
##
## $`miR-138`
## original 20 30 40 50
## 0.801 0.809 0.824 0.798 0.732
##
## $`miR-145`
## original 20 30 40 50
## 0.767 0.703 0.683 0.655 0.519
##
## $`miR-184`
## original 20 30 40 50
## 0.568 0.525 0.467 0.485 0.375
##
## $`miR-190a`
## original 20 30 40 50
## 0.771 0.761 0.715 0.706 0.481
##
## $`miR-200b`
## original 20 30 40 50
## 0.801 0.801 0.801 0.795 0.749
##
## $`miR-216a`
## original 20 30 40 50
## 0.656 0.453 0.269 0.158 0.109
##
## $`miR-217`
## original 20 30 40 50
## 0.729 0.709 0.670 0.538 0.287
##
## $`miR-219a`
## original 20 30 40 50
## 0.795 0.824 0.805 0.805 0.805
##
## $`miR-375`
## original 20 30 40 50
## 0.795 0.757 0.759 0.596 0.563
##
## $`miR-451a`
## original 20 30 40 50
## 0.648 0.619 0.601 0.431 0.463
# mean scores over all methods
ggplot(df, aes(x=prop, y=detPPV, fill=method)) + geom_violin(draw_quantiles = 0.5) + facet_wrap(~method) +
guides(fill=FALSE)mean.methods <- lapply( unique(df$method), FUN=function(x)
sapply( unique(df$prop), function(y)
round( mean(df$detPPV[df$method==x & df$prop==y], na.rm=TRUE), 3)
)
)
names(mean.methods) <- unique(df$method)
for(i in unique(df$method)){
names(mean.methods[[i]]) <- unique(df$prop)
}
mean.methods## $aREAmir
## original 20 30 40 50
## 0.955 0.955 0.876 0.794 0.609
##
## $EN.up
## original 20 30 40 50
## 0.013 0.013 0.012 0.013 0.012
##
## $EN.down
## original 20 30 40 50
## 1.000 0.939 0.906 0.795 0.577
##
## $EN.combined
## original 20 30 40 50
## 1.000 0.939 0.899 0.826 0.715
##
## $wEN.up
## original 20 30 40 50
## 0.006 0.006 0.012 0.007 0.007
##
## $wEN.down
## original 20 30 40 50
## 1.000 1.000 0.955 0.912 0.851
##
## $wEN.combined
## original 20 30 40 50
## 1.000 1.000 0.926 0.922 0.901
##
## $michael.up
## original 20 30 40 50
## 0.006 0.007 0.014 0.007 0.007
##
## $michael.down
## original 20 30 40 50
## 1.000 1.000 0.957 0.914 0.894
##
## $michael.combined
## original 20 30 40 50
## 1.000 1.000 0.951 0.864 0.743
##
## $MW
## original 20 30 40 50
## 0.865 0.786 0.770 0.630 0.439
##
## $KS
## original 20 30 40 50
## 0.716 0.691 0.658 0.538 0.364
##
## $KS2
## original 20 30 40 50
## 1.000 0.955 0.947 0.878 0.786
##
## $GSEA
## original 20 30 40 50
## 0.528 0.481 0.477 0.493 0.453
##
## $GSEAmodified
## original 20 30 40 50
## 0.447 0.380 0.400 0.350 0.314
##
## $modSites
## original 20 30 40 50
## 0.955 0.867 0.798 0.724 0.581
##
## $modScore
## original 20 30 40 50
## 1.000 0.902 0.841 0.731 0.644
# sensitivity score (ratio of TP at FDR 0.05)
sens.methods <- lapply( unique(df$method), FUN=function(x)
sapply( unique(df$prop), function(y)
round( sum(df$TP.atFDR05==1 & df$method==x & df$prop==y) / sum(df$method==x & df$prop==y), 3)
)
)
names(sens.methods) <- unique(df$method)
for(i in unique(df$method)){
names(sens.methods[[i]]) <- unique(df$prop)
}
sens.methods## $aREAmir
## original 20 30 40 50
## 0.818 0.667 0.545 0.515 0.303
##
## $EN.up
## original 20 30 40 50
## 0 0 0 0 0
##
## $EN.down
## original 20 30 40 50
## 1 1 1 1 1
##
## $EN.combined
## original 20 30 40 50
## 1.000 1.000 0.970 0.970 0.939
##
## $wEN.up
## original 20 30 40 50
## 0 0 0 0 0
##
## $wEN.down
## original 20 30 40 50
## 1 1 1 1 1
##
## $wEN.combined
## original 20 30 40 50
## 1.000 1.000 0.970 0.970 0.909
##
## $michael.up
## original 20 30 40 50
## 0 0 0 0 0
##
## $michael.down
## original 20 30 40 50
## 1.000 1.000 0.970 0.939 0.818
##
## $michael.combined
## original 20 30 40 50
## 1.000 1.000 0.970 0.909 0.727
##
## $MW
## original 20 30 40 50
## 1.00 1.00 1.00 1.00 0.97
##
## $KS
## original 20 30 40 50
## 1.00 1.00 1.00 1.00 0.97
##
## $KS2
## original 20 30 40 50
## 0.909 0.909 0.818 0.758 0.576
##
## $GSEA
## original 20 30 40 50
## 0.091 0.121 0.182 0.212 0.303
##
## $GSEAmodified
## original 20 30 40 50
## 0.364 0.364 0.485 0.545 0.576
##
## $modSites
## original 20 30 40 50
## 1.000 1.000 1.000 1.000 0.879
##
## $modScore
## original 20 30 40 50
## 1.000 1.000 1.000 1.000 0.879